# ------------------------------------------------------------------
# ' Results for footnote 11.
# ' Processes the coded text data together with zip code information
# ' to produce the `tess_zip_text.dta` and `tess2016_multiple_choice_zip.dta` files.
# ------------------------------------------------------------------
library(here)
library(data.table)
library(rio)
library(dplyr)
library(tidyr)
library(ggplot2)
library(hrbrthemes)

# Load the manual coding file. The columns used below (`coding_id`,
# `category`, `matched`, `p_matched`) are read straight from the CSV.
# NOTE(review): presumably one row per (coding_id, category) pair -- confirm.
coding_match <- fread(here('data','manual_coding','tess_coding_matched.csv'))

# compare % match
# naive matches
# n_category: number of distinct categories recorded for each coding_id
coding_match[, n_category := uniqueN(category), by = "coding_id"]
# id_category: running row index within each coding_id, so that
# `id_category == 1` keeps exactly one row per coding_id below.
# seq_len(.N) is the idiomatic (integer) within-group counter.
coding_match[, id_category := seq_len(.N), by = "coding_id"]

# Row-level rate: mean of `matched` over all rows
mean(coding_match[, matched]) # 63%
# Rates over one row per coding_id (first row of each group).
# NOTE(review): assumes p_matched is constant within coding_id -- confirm.
mean(coding_match[id_category == 1, p_matched] == 1) # 59.1% are hundred percent match
mean(coding_match[id_category == 1, p_matched] > 0) # 88.9% have at least one match
mean(coding_match[id_category == 1, p_matched]) # 74% is the average of p_matched

# reclassify category based on three schemes
# -- id match file, joined to the coding rows on coding_id
text_match <- fread(here('data','manual_coding',"textcoding_idmatch.csv"))

# Left join: every row of text_match is kept; rows without a coding entry
# get NA in the columns coming from coding_match.
text_raw <- merge(text_match, coding_match, by = "coding_id", all.x = TRUE)

# -- extract year/id information from text coding id:
#    characters 1-4 of uid are parsed as the year, characters 5-8 as the id
text_raw[, `:=`(
  year = as.integer(substr(as.character(uid), 1, 4)),
  id   = as.integer(substr(as.character(uid), 5, 8))
)]

# -- coding category match file; category labels are converted to title case
category_match <- fread(here('data','manual_coding','coding_category_match.csv'))
category_match[, category := stringr::str_to_title(category)]

# check discrepancy
#setdiff(unique(coding_match[,category]),category_match[,category])
#setdiff(category_match[,category],unique(coding_match[,category]))

# Left join on category: attach the columns of category_match to each coded row
text_all <- merge(text_raw, category_match, by = "category", all.x = TRUE)

# Survey data: keep rows with year == 2016 (rows with missing year are
# dropped), then drop rows with a missing weight.
tess2016 <- import(here('data','processed',"tess_cleaned.dta"))
tess2016 <- tess2016[which(tess2016$year == 2016), ]
tess2016 <- tess2016[!is.na(tess2016$weight), ]

# Respondent identifier and zip code only
tess2016_zip <- tess2016[, c("caseid", "zip")]

# Coded text rows for 2016: flag rows whose category is
# 'Politics And Election', then average the flag within each id
# (NA categories are ignored by na.rm = TRUE).
textset_2016 <- text_all[year == 2016, c("id", "category")]
textset_2016[, politics := as.numeric(category == 'Politics And Election')]
textset_2016 <- textset_2016[
  ,
  .(mean_politics = mean(politics, na.rm = TRUE)),
  by = "id"
]

# Left join onto the survey rows: `caseid` in the survey data is matched
# against `id` from the text coding; unmatched survey rows get NA.
textzip_2016 <- merge(
  x = tess2016_zip,
  y = textset_2016,
  by.x = "caseid",
  by.y = "id",
  all.x = TRUE
)

# Add a constant year column and write the result as a Stata file
textzip <- data.table(textzip_2016, year = 2016)
export(textzip, here('data','processed','tess_zip_text.dta'))

# ------------------------------------------------------------------------------
# zipcode - multiple choices
# ------------------------------------------------------------------------------

# Column names of the multiple-choice items, built as prefix + item name
t0 <- paste0(
  "topic_",
  c("debate", "election", "football")
)
t1 <- paste0(
  "topic1_",
  c("genewilder", "spacex", "chrisbrown", "earthquake", "hurricane")
)
t2 <- paste0(
  "topic2_",
  c("shawtylo", "genniferflowers", "arnoldpalmer", "bradpitt")
)

# Keep the identifier, zip code and all multiple-choice columns from the
# 2016 survey rows, then write them out as a Stata file.
# NOTE(review): this selects `id`, while the zip/text merge earlier in the
# script selects `caseid` from the same table -- confirm both columns exist
# and that `id` is the intended identifier here.
tess_multiple_zip <- tess2016[, c("id", "zip", t0, t1, t2)]
export(
  tess_multiple_zip,
  here('data','processed','tess2016_multiple_choice_zip.dta')
)
